This code follows the protocol for differential expression analysis of RNA sequencing data using R and Bioconductor given in Anders et al (2013).

# Load the data 
samples = read.table("samples.csv", sep=";", header=TRUE)
dim(samples)
## [1] 7 7
counts = read.csv("counts.csv")
dim(counts)
## [1] 7196    7
library(edgeR)
counts = readDGE(samples$countf)$counts
dim(counts)
## [1] 15686     7
# Filter weakly expressed and noninformative (e.g., non-aligned) features 
noint = rownames(counts) %in%
  c("no_feature","ambiguous","too_low_aQual",
    "not_aligned","alignment_not_unique")
cpms = cpm(counts)
counts.all <- counts[,order(samples$condition)]
colnames(counts.all) = samples$shortname[order(samples$condition)]
head(counts.all)
##             CT.PA.1 CT.PA.2 CT.SI.5 CT.SI.7 KD.PA.3 KD.PA.4 KD.SI.6
## FBgn0000008      76      71     137      82      87      68     115
## FBgn0000014       2       4       1       1       9       1       0
## FBgn0000015       1       2       0       0       3       1       2
## FBgn0000017    3498    3087    7014    3926    3029    3264    4322
## FBgn0000018     240     306     613     485     288     307     528
## FBgn0000022       0       0       1       0       0       0       0
# Keep only genes with at least 1 read per million, for the smallest 
# number of replicates in  treatment
keep = rowSums(cpms > 1) >= 3 & !noint
counts = counts.all[keep,]
#colnames(counts) = samples$shortname
head(counts, 5)
##             CT.PA.1 CT.PA.2 CT.SI.5 CT.SI.7 KD.PA.3 KD.PA.4 KD.SI.6
## FBgn0000008      76      71     137      82      87      68     115
## FBgn0000017    3498    3087    7014    3926    3029    3264    4322
## FBgn0000018     240     306     613     485     288     307     528
## FBgn0000032     611     672    1479    1351     694     757    1361
## FBgn0000042   40048   49144   97565   99372   70574   72850   95760
d = DGEList(counts = counts, group = samples$condition[order(samples$condition)])
str(d)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
##   ..@ .Data:List of 2
##   .. ..$ : num [1:7196, 1:7] 76 3498 240 611 40048 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:7196] "FBgn0000008" "FBgn0000017" "FBgn0000018" "FBgn0000032" ...
##   .. .. .. ..$ : chr [1:7] "CT.PA.1" "CT.PA.2" "CT.SI.5" "CT.SI.7" ...
##   .. ..$ :'data.frame':  7 obs. of  3 variables:
##   .. .. ..$ group       : Factor w/ 2 levels "CTL","KD": 1 1 1 1 2 2 2
##   .. .. ..$ lib.size    : num [1:7] 8397136 9909691 19087995 12812818 9664838 ...
##   .. .. ..$ norm.factors: num [1:7] 1 1 1 1 1 1 1
d = calcNormFactors(d)
str(d)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
##   ..@ .Data:List of 2
##   .. ..$ : num [1:7196, 1:7] 76 3498 240 611 40048 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:7196] "FBgn0000008" "FBgn0000017" "FBgn0000018" "FBgn0000032" ...
##   .. .. .. ..$ : chr [1:7] "CT.PA.1" "CT.PA.2" "CT.SI.5" "CT.SI.7" ...
##   .. ..$ :'data.frame':  7 obs. of  3 variables:
##   .. .. ..$ group       : Factor w/ 2 levels "CTL","KD": 1 1 1 1 2 2 2
##   .. .. ..$ lib.size    : num [1:7] 8397136 9909691 19087995 12812818 9664838 ...
##   .. .. ..$ norm.factors: num [1:7] 0.97 0.965 1.001 1.015 0.997 ...
d$samples
##         group lib.size norm.factors
## CT.PA.1   CTL  8397136    0.9702373
## CT.PA.2   CTL  9909691    0.9652457
## CT.SI.5   CTL 19087995    1.0009795
## CT.SI.7   CTL 12812818    1.0145053
## KD.PA.3    KD  9664838    0.9973330
## KD.PA.4    KD 10325828    1.0146062
## KD.SI.6    KD 15324886    1.0391230
par(pty="s")
d.mds <- plotMDS(d, labels = samples$shortname[order(samples$condition)],
       col = c("darkgreen","blue")[factor(samples$condition[order(samples$condition)])],
       gene.selection = "common", xlim=c(-1.1, 1.1))

# Check the normalization
library(ggplot2)
library(GGally)
d$counts <- data.frame(d$counts) # Needs to be a data frame
d$lcounts <- log(d$counts+1)
ggparcoord(d$lcounts, columns=1:7, boxplot=TRUE, scale="globalminmax", 
           showPoints=FALSE, alphaLines=0) + 
  xlab("") + ylab("log(cpm)") + 
  theme_bw()

# ggparcoord(d$lcounts, columns=1:7, alphaLines=0.1)

# Use trimmed mean normalization
d = calcNormFactors(d, method="TMM")
nc = data.frame(cpm(d, normalized.lib.sizes = TRUE, log=TRUE))
ggparcoord(nc, columns=1:7, boxplot=TRUE, scale="globalminmax", showPoints=FALSE,
           alphaLines=0) + 
  xlab("") + ylab("log(cpm)") + 
  theme_bw()

# Check that the MDS was conducted on the normalized data
d.mds2 <- plotMDS(nc, labels = samples$shortname[order(samples$condition)],
       col = c("darkgreen","blue")[factor(samples$condition[order(samples$condition)])],
       gene.selection = "common", xlim=c(-1.1, 1.1))

# Scatterplot matrix of normalized data
ggscatmat(nc)

# Full parallel coordinate plot
ggparcoord(nc, columns=1:7, scale="globalminmax", alphaLines=0.5) + 
  xlab("") + ylab("log(cpm)") + 
  theme_bw()

ggparcoord(nc, columns=1:7, scale="globalminmax", alphaLines=0.01) + 
  xlab("") + ylab("log(cpm)") + 
  theme_bw()

# Porcupine plot
ggplot(nc) + geom_segment(aes(x=CT.PA.1, xend=CT.PA.2, y=KD.PA.3, yend=KD.PA.4)) +
  theme_bw() + theme(aspect.ratio=1)

# Check the mean and variance relationship
# For the Poisson model, the mean = variance 
# RNA seq tends to be more overdispersed, variance is larger than expected
# Over-dispersed leads to fitting a negative binomial model
# Common dispersion would be used if all genes assumed to have same variance
d = estimateCommonDisp(d)
d$common.dispersion
## [1] 0.03329083
# Tagwise dispersion is a weighted average of individual gene dispersion 
# with common dispersion
d = estimateTagwiseDisp(d)
summary(d$tagwise.dispersion)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01647 0.02187 0.02589 0.03540 0.03599 1.67900
d$prior.n
## [1] 2
plotMeanVar(d, show.tagwise.vars = TRUE, NBline = TRUE)

plotBCV(d)

# Examine the dispersions, in relation to the means
mv <- binMeanVar(d, group = d$samples$group)
# $means are the means for each gene
# $vars are the pooled variances for each gene
qplot(mv$means, mv$vars, alpha=I(0.5)) + scale_x_log10() + scale_y_log10() + 
  geom_smooth(method="lm") + theme_bw() + theme(aspect.ratio=1)

qplot(mv$means, d$tagwise.dispersion) + scale_x_log10() + scale_y_log10() +
  geom_smooth() + theme_bw() + theme(aspect.ratio=1)

qplot(mv$vars, d$tagwise.dispersion) + scale_x_log10() + scale_y_log10() +
  geom_smooth() + theme_bw() + theme(aspect.ratio=1)

# Test for differential expression (‘classic’ edgeR)
de = exactTest(d, pair = c("CTL","KD"))
tt = topTags(de, n = nrow(d), sort.by="none")
nc.sig <- data.frame(gene=rownames(nc), nc, tt)
nc.sig$sig05 <- ifelse(nc.sig$FDR < 0.05, "S", "NS")
nc.sig$sig01 <- ifelse(nc.sig$FDR < 0.01, "S", "NS")

# Porcupine plot with significance
ggplot(nc.sig) + geom_segment(aes(x=CT.PA.1, xend=CT.PA.2, y=KD.PA.3, 
                                  yend=KD.PA.4, color=sig05)) +
  scale_color_manual(values=c("S"="red", "NS"="grey90")) + 
  xlab("CT") + ylab("KD") +
  theme_bw() + theme(aspect.ratio=1)

ggplot(nc.sig) + geom_segment(aes(x=CT.PA.1, xend=CT.PA.2, y=KD.PA.3, 
                                  yend=KD.PA.4, color=sig01)) +
  scale_color_manual(values=c("S"="red", "NS"="grey90")) + 
  xlab("CT") + ylab("KD") +
  theme_bw() + theme(aspect.ratio=1)

# MV plots
nc.sig$sig01 <- factor(nc.sig$sig01, levels=c("NS","S"))
ggscatmat(nc.sig, columns=2:8, color="sig01") +
  scale_color_manual(values=c("S"="red", "NS"="white"))

ggscatmat(nc.sig[nc.sig$sig01=="S",], columns=2:8)  

ggparcoord(nc.sig, columns=2:8, scale="globalminmax", alphaLines=0.5, 
           groupColumn="sig01") +
  scale_color_manual(values=c("S"="red", "NS"="grey90")) + 
  xlab("") + ylab("log(cpm)") + 
  theme_bw()

ggparcoord(nc.sig[nc.sig$sig01=="S",], columns=2:8, scale="globalminmax", 
           alphaLines=0.5) +
  xlab("") + ylab("log(cpm)") + 
  theme_bw()

# Interaction plots of top genes
library(tidyr)
library(dplyr)
nc.sig <- arrange(nc.sig, PValue)
g1 <- gather(nc.sig[1,2:8])
g1$trt <- substr(g1$key, 1, 2)
g1$read <- substr(g1$key, 4, 5)
g1.mean <- summarise(group_by(g1, trt), m=mean(value))
qplot(trt, value, data=g1, xlab="Treatment", ylab="logCPM", colour=read, 
      size=I(5), alpha=I(0.5)) +
  annotate("segment", x=1, xend=2, y=g1.mean$m[1], 
           yend=g1.mean$m[2], colour="grey80") + 
  ggtitle(nc.sig$gene[1]) +
  theme_bw() + theme(aspect.ratio=1)

table(nc.sig$sig01)
## 
##   NS    S 
## 6882  314
g314 <- gather(nc.sig[314,2:8])
g314$trt <- substr(g314$key, 1, 2)
g314$read <- substr(g314$key, 4, 5)
g314.mean <- summarise(group_by(g314, trt), m=mean(value))
qplot(trt, value, data=g314, xlab="Treatment", ylab="logCPM", colour=read, 
      size=I(5), alpha=I(0.5)) +
  annotate("segment", x=1, xend=2, y=g314.mean$m[1], 
           yend=g314.mean$m[2], colour="grey80") + 
  ggtitle(nc.sig$gene[314]) +
  theme_bw() + theme(aspect.ratio=1)

# To check the effect size, visually, we are going to scramble the 
# labels, and re-run the significance testing
dp <- d
ncp <- nc
dp$samples$group <- c("CTL","KD","CTL","KD","CTL","KD","CTL")
dp$samples$group
## [1] "CTL" "KD"  "CTL" "KD"  "CTL" "KD"  "CTL"
dep = exactTest(dp, pair = c("CTL","KD"))
ttp = topTags(dep, n = nrow(dp), sort.by="none")
ncp.sig <- data.frame(gene=rownames(ncp), ncp, ttp)
ncp.sig$sig05 <- ifelse(ncp.sig$FDR < 0.05, "S", "NS")
ncp.sig$sig01 <- ifelse(ncp.sig$FDR < 0.01, "S", "NS")
ncp.sig <- arrange(ncp.sig, PValue)
g1 <- gather(ncp.sig[1,2:8])
g1$trt <- dp$samples$group
g1.mean <- summarise(group_by(g1, trt), m=mean(value))
qplot(trt, value, data=g1, xlab="Treatment", ylab="logCPM",  
      size=I(5), alpha=I(0.5)) +
  annotate("segment", x=1, xend=2, y=g1.mean$m[1], 
           yend=g1.mean$m[2], colour="grey80") + 
  ggtitle(ncp.sig$gene[314]) +
  theme_bw() + theme(aspect.ratio=1)

table(ncp.sig$sig01)
## 
##   NS 
## 7196
g314 <- gather(ncp.sig[314,2:8])
g314$trt <- dp$samples$group
g314.mean <- summarise(group_by(g314, trt), m=mean(value))
qplot(trt, value, data=g314, xlab="Treatment", ylab="logCPM", 
      size=I(5), alpha=I(0.5)) +
  annotate("segment", x=1, xend=2, y=g314.mean$m[1], 
           yend=g314.mean$m[2], colour="grey80") + 
  ggtitle(ncp.sig$gene[314]) +
  theme_bw() + theme(aspect.ratio=1)

Bibliography

Anders, S., McCarthy, D. J., Chen, Y., Okoniewski, M., Smyth, G. K., Huber, W. and Robinson, M. D. (2013) “Count-based differential expression analysis of RNA sequencing data using R and Bioconductor”, Nature Protocols, 8(9):1765-1786, doi:10.1038/nprot.2013.099.